source("ingest_data.R")
levels(data$repo)
## [1] "IntTest" "Jupiter" "Mars" "Mercury" "Neptune" "Saturn" "Uranus"
## [8] "Venus"
levels(data$committerteam)
## [1] "Arch" "Blue" "Brown" "Green" "Orange" "Pink" "Red"
## [8] "UI" "Unknown" "Violet" "Yellow"
levels(data$authorteam)
## [1] "Arch" "Blue" "Brown" "Green" "Orange" "Pink" "Red"
## [8] "UI" "Unknown" "Violet" "Yellow"
We have 10 teams (plus one “Unknown”) working in 8 repositories.
Introduced duplicates in files, per repo.
# Summary-statistic helpers used when tabulating the change metrics.
# The quantile helpers use type 3, which returns an observed value rather
# than interpolating between observations.
low_stat <- function(x) {
  quantile(x, probs = 0.25, type = 3)
}
median_stat <- function(x) {
  quantile(x, probs = 0.5, type = 3)
}
high_stat <- function(x) {
  quantile(x, probs = 0.75, type = 3)
}
# Delegates to q95(), which is defined outside this section.
q95_stat <- function(x) {
  q95(x)
}
very_high_stat <- function(x) {
  quantile(x, probs = 0.99, type = 3)
}
max_stat <- function(x) {
  max(x)
}
# One long-format table per statistic (columns: name, <statistic>) for the
# five change metrics, then joined on the metric name into a single table.
metric_columns <- data |> select(ADD, DEL, COMPLEX, DUP, INTROD)
m1 <- metric_columns |>
  summarise(across(ADD:INTROD, min)) |>
  pivot_longer(cols = everything(), values_to = "min")
m2 <- metric_columns |>
  summarise(across(ADD:INTROD, low_stat)) |>
  pivot_longer(cols = everything(), values_to = "q25")
m3 <- metric_columns |>
  summarise(across(ADD:INTROD, median_stat)) |>
  pivot_longer(cols = everything(), values_to = "median")
m4 <- metric_columns |>
  summarise(across(ADD:INTROD, high_stat)) |>
  pivot_longer(cols = everything(), values_to = "q75")
m5 <- metric_columns |>
  summarise(across(ADD:INTROD, q95_stat)) |>
  pivot_longer(cols = everything(), values_to = "q95")
m6 <- metric_columns |>
  summarise(across(ADD:INTROD, very_high_stat)) |>
  pivot_longer(cols = everything(), values_to = "q99")
m7 <- metric_columns |>
  summarise(across(ADD:INTROD, max)) |>
  pivot_longer(cols = everything(), values_to = "max")
# Left fold: merge(merge(m1, m2), m3) ... in the same order as before.
metrics <- Reduce(
  function(joined, next_stat) merge(joined, next_stat, by = "name"),
  list(m1, m2, m3, m4, m5, m6, m7)
)
metrics
## name min q25 median q75 q95 q99 max
## 1 ADD 0 1 6 28 143 370 3772
## 2 COMPLEX 0 3 16 52 282 633 1244
## 3 DEL 0 0 2 12 92 311 3413
## 4 DUP 0 0 0 2 36 99 664
## 5 INTROD 0 0 0 0 1 5 150
# Histogram of introduced duplicates (INTROD) per file change, one panel per
# repository. Changes with INTROD == 0 are filtered out.
# The x axis carries the INTROD value; the y axis is the number of file
# changes in each bin, so each label now sits on the axis it describes.
(p <- data |>
  filter(INTROD > 0) |>
  group_by(repo) |>
  ggplot(aes(x = INTROD)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~ repo) +
  xlab("Number of introduced duplicates in each file change") +
  ylab("Number of file changes")
)
Most of the introduced duplicates are small (single-digits), but some are large, ranging into the hundreds for the IntTest repo.
p + scale_y_continuous(limits=c(0,30))
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_bar()`).
Limiting the scale to 30 shows the less frequently occurring values more clearly.
Proportion of file changes that introduce at least one clone, per team and repository. Size of the dot is proportional to the number of files changed by the team in the repository.
# Number of file changes per repository and committer team.
changesPerRepoAndTeam <- data |>
  group_by(repo, committerteam) |>
  summarise(fileschanged = n())
# Number of those changes that introduced no duplicate (INTROD == 0).
zerosPerRepoAndTeam <- data |>
  filter(INTROD == 0) |>
  group_by(repo, committerteam) |>
  summarise(zeros = n())
# Keep every repo/team pair. A pair without any zero-INTROD change is absent
# from zerosPerRepoAndTeam, so an inner merge would silently drop it even
# though its ratio is a legitimate 1; keep it and count its zeros as 0.
zeros_ratio <- merge(
  changesPerRepoAndTeam,
  zerosPerRepoAndTeam,
  by = c("repo", "committerteam"),
  all.x = TRUE
) |>
  mutate(
    zeros = ifelse(is.na(zeros), 0L, zeros),
    introdRatio = 1 - (zeros / fileschanged)
  ) |>
  arrange(introdRatio)
# Share of file changes that introduce at least one duplicate, per team and
# repository. Point size follows the number of changed files; rows of the
# "Unknown" team are filtered out.
(p <- zeros_ratio |>
  filter(committerteam != "Unknown") |>
  ggplot(aes(
    x = committerteam,
    y = introdRatio,
    color = committerteam,
    size = fileschanged
  )) +
  geom_point() +
  facet_wrap(~ repo) +
  xlab("team") +
  scale_color_manual(values = COLOR_BY_TEAM) +
  theme_bw() +
  scale_y_continuous(limits = c(0, 0.27)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
)
As we cannot ascertain which team the Unknown team members participate in, we exclude them from this metric.
Viewing the (log-transformed) added and removed lines, we see some correlation, and also quite a few “pure additions” and “pure removals” (along the x and y axes, respectively).
# Scatter of logDEL against logADD, coloured by committer team, one panel per
# repository.
data |>
  group_by(repo) |>
  ggplot(aes(x = logADD, y = logDEL, color = committerteam)) +
  geom_point() +
  facet_wrap(~ repo)
We see that, overall, there are more duplicates present in the IntTest and Jupiter repos (Jupiter being the largest one). There also seem to be at least some indications of correlation with complexity (although there are also some duplicates in files where complexity is 0, i.e. along the y axis).
# Scatter of logDUP against logCOMPLEX, one panel per repository.
# No colour aesthetic is mapped here, so the manual team colour scale is not
# attached: it had nothing to act on and only raised a
# "No shared levels found" warning. The figure itself is unchanged.
data |>
  group_by(repo) |>
  ggplot(aes(x = logCOMPLEX, y = logDUP)) +
  geom_point() +
  facet_wrap(~ repo) +
  theme_bw()
A more thorough pairs plot reveals that the parameters seem quite independent - at least no obvious correlations are present (though logCOMPLEX and logDUP are somewhat related, as seen above). The plot can be repeated per repo or per team, with the same conclusions.
ggpairs(data |> select(logADD, logDEL, logCOMPLEX, logDUP))
# Per repository and committer team: upper quantiles and maximum of INTROD,
# number of file changes, and mean/median added and removed lines.
# quantile type 3 to avoid having interpolation of the observations
repo_committerteam <- data |>
  group_by(repo, committerteam) |>
  summarize(
    q95 = quantile(INTROD, probs = 0.95, type = 3),
    q99 = quantile(INTROD, probs = 0.99, type = 3),
    max = max(INTROD),
    files = n(),
    mean_added = mean(ADD),
    median_added = median(ADD),
    mean_removed = mean(DEL),
    median_removed = median(DEL)
  ) |>
  mutate(team = committerteam)
# Heatmap of the number of file changes per team and repository; each tile
# is labelled with its count.
ggplot(repo_committerteam, aes(x = team, y = repo)) +
  geom_tile(aes(fill = files)) +
  geom_text(aes(label = round(files, 0)), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed number of changes to files by team and repo"
  )
Repositories are named and sized (though not to scale) after the seven neighboring planets, plus the INTTEST repository, which contains integration tests developed in Java.
We note that some teams have not changed files in some repositories, and that some teams have made very many, and some very few, changes to some repositories.
# Heatmaps of line churn per team and repository, each tile labelled with the
# rounded value: mean and median added lines, then mean and median removed
# lines.
ggplot(repo_committerteam, aes(x = team, y = repo)) +
  geom_tile(aes(fill = mean_added)) +
  geom_text(aes(label = round(mean_added, 0)), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed mean number of added lines to files by team and repo"
  )
ggplot(repo_committerteam, aes(x = team, y = repo)) +
  geom_tile(aes(fill = median_added)) +
  geom_text(aes(label = round(median_added, 0)), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed median number of added lines to files by team and repo"
  )
ggplot(repo_committerteam, aes(x = team, y = repo)) +
  geom_tile(aes(fill = mean_removed)) +
  geom_text(aes(label = round(mean_removed, 0)), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed mean number of removed lines to files by team and repo"
  )
ggplot(repo_committerteam, aes(x = team, y = repo)) +
  geom_tile(aes(fill = median_removed)) +
  geom_text(aes(label = round(median_removed, 0)), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed median number of removed lines to files by team and repo"
  )
We note that adding lines is more common than removing them, and that the Architect team leads in removing lines in most repositories.
The number of introduced duplicates follows a Poisson-like (or Negative Binomial) distribution once you exclude all the zeros, which are left to the zero-inflation part.
# Histogram of introduced duplicates (INTROD) in one repository, one panel
# per team, with counts on the y axis (limited to 0-65).
#   aRepo         - value compared against the repo column
#   team_selector - predicate applied to committerteam; rows for which it
#                   returns TRUE are kept
# Reads the global `data` and the `teamcolors` palette.
plot_introd_issues_in_repo <- function(aRepo, team_selector) {
  selected <- data |>
    filter(repo == aRepo, team_selector(committerteam)) |>
    mutate(team = committerteam) |>
    group_by(team)
  ggplot(selected, aes(x = INTROD, color = team, fill = team)) +
    geom_histogram(binwidth = 1) +
    facet_wrap(~ team) +
    scale_fill_manual(name = "team", values = teamcolors) +
    scale_colour_manual(name = "team", values = teamcolors) +
    ylab("number of changed files") +
    xlab("Number of duplicates introduced in the change") +
    ylim(0, 65)
}
(p <- plot_introd_issues_in_repo(JUPITER, function(x) x %in% c(RED, BLUE, GREEN, ARCH)) + theme_bw() + theme(legend.position = "bottom"))
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Density histogram of introduced duplicates (INTROD) in one repository, one
# panel per team.
#   aRepo         - value compared against the repo column; also shown in
#                   the plot title
#   team_selector - predicate applied to committerteam; rows for which it
#                   returns TRUE are kept
# Reads the global `data` and the `teamcolors` palette.
plot_introd_per_team <- function(aRepo, team_selector) {
  data |>
    filter(repo == aRepo, team_selector(committerteam)) |>
    mutate(team = committerteam) |>
    group_by(team) |>
    ggplot(aes(x = INTROD, color = team, fill = team)) +
    # after_stat(density) replaces the `..density..` dot-dot notation, which
    # was deprecated in ggplot2 3.4.0.
    geom_histogram(aes(y = after_stat(density)), binwidth = 1) +
    facet_wrap(~ team) +
    ggtitle(paste("Introduced duplicates in repo", aRepo)) +
    scale_fill_manual(name = "team", values = teamcolors) +
    scale_colour_manual(name = "team", values = teamcolors) +
    ylab("proportion of filechanges") +
    xlab("Number of duplicates introduced in the change")
}
(p <- plot_introd_per_team(JUPITER, function(x) x %in% c(ARCH, RED, BLUE, GREEN)) + theme_bw() + scale_y_continuous(limits=c(0,0.04)) )
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).
(p <- plot_introd_per_team(INTTEST, function(x) x %in% c(ARCH, RED, BLUE, GREEN)) + theme_bw() + scale_y_continuous(limits=c(0,0.04)) )
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).
(p <- plot_introd_per_team(URANUS, function(x) x %in% c(ARCH, RED, BLUE, GREEN)) + theme_bw() + scale_y_continuous(limits=c(0,0.06)) )
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).
(p <- plot_introd_per_team(VENUS, function(x) x %in% c(ARCH, RED, BLUE, GREEN)) + theme_bw() + scale_y_continuous(limits=c(0,0.04)) )
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Number of file changes per INTROD value for team ARCH in repo INTTEST.
data |>
  filter(repo == INTTEST, committerteam == ARCH) |>
  count(INTROD)
## # A tibble: 4 × 2
## INTROD n
## <int> <int>
## 1 0 238
## 2 1 1
## 3 2 1
## 4 4 1
# Number of file changes per INTROD value for team ARCH in repo VENUS.
data |>
  filter(repo == VENUS, committerteam == ARCH) |>
  count(INTROD)
## # A tibble: 1 × 2
## INTROD n
## <int> <int>
## 1 0 86
# Number of file changes per INTROD value for team GREEN in repo VENUS.
data |>
  filter(repo == VENUS, committerteam == GREEN) |>
  count(INTROD)
## # A tibble: 3 × 2
## INTROD n
## <int> <int>
## 1 0 347
## 2 1 2
## 3 2 2
# Median COMPLEX of the files changed by BLUE and GREEN in repo URANUS.
data |>
  filter(repo == URANUS, committerteam %in% c(BLUE, GREEN)) |>
  group_by(committerteam) |>
  summarize(median(COMPLEX))
## # A tibble: 2 × 2
## committerteam `median(COMPLEX)`
## <fct> <dbl>
## 1 Blue 14
## 2 Green 16
# Per team (BLUE, GREEN) in repo MARS: count and median COMPLEX for each
# INTROD value. summarize() leaves the result grouped by committerteam, so
# freq is the share within each team.
data |>
  filter(repo == MARS, committerteam %in% c(BLUE, GREEN)) |>
  group_by(committerteam, INTROD) |>
  summarize(n = n(), median(COMPLEX)) |>
  mutate(freq = n / sum(n))
## # A tibble: 13 × 5
## # Groups: committerteam [2]
## committerteam INTROD n `median(COMPLEX)` freq
## <fct> <int> <int> <dbl> <dbl>
## 1 Blue 0 730 6.5 0.952
## 2 Blue 1 18 26.5 0.0235
## 3 Blue 2 16 29 0.0209
## 4 Blue 4 1 16 0.00130
## 5 Blue 6 1 48 0.00130
## 6 Blue 14 1 106 0.00130
## 7 Green 0 561 6 0.862
## 8 Green 1 65 8 0.0998
## 9 Green 2 10 15 0.0154
## 10 Green 3 10 22 0.0154
## 11 Green 4 3 2 0.00461
## 12 Green 5 1 18 0.00154
## 13 Green 10 1 2 0.00154
# Per team in repo JUPITER: count of file changes for each INTROD value.
# summarize() leaves the result grouped by committerteam, so freq is the
# share within each team.
data |>
  filter(repo == JUPITER, committerteam %in% c(ARCH, BLUE, RED, GREEN)) |>
  group_by(committerteam, INTROD) |>
  summarize(n = n()) |>
  mutate(freq = n / sum(n))
## # A tibble: 40 × 4
## # Groups: committerteam [4]
## committerteam INTROD n freq
## <fct> <int> <int> <dbl>
## 1 Arch 0 594 0.953
## 2 Arch 1 23 0.0369
## 3 Arch 2 3 0.00482
## 4 Arch 4 2 0.00321
## 5 Arch 6 1 0.00161
## 6 Blue 0 2006 0.945
## 7 Blue 1 61 0.0287
## 8 Blue 2 23 0.0108
## 9 Blue 3 17 0.00801
## 10 Blue 4 8 0.00377
## # ℹ 30 more rows
There are also differences between teams — we see that the ARCH team in this repo introduced very few duplicates, whereas the Blue and Green teams were more comparable.
The pattern is even more pronounced in the IntTest repo
# Per team in repo URANUS: count of file changes for each INTROD value.
# summarize() leaves the result grouped by committerteam, so freq is the
# share within each team.
data |>
  filter(repo == URANUS, committerteam %in% c(ARCH, BLUE, RED, GREEN)) |>
  group_by(committerteam, INTROD) |>
  summarize(n = n()) |>
  mutate(freq = n / sum(n))
## # A tibble: 19 × 4
## # Groups: committerteam [4]
## committerteam INTROD n freq
## <fct> <int> <int> <dbl>
## 1 Arch 0 116 0.967
## 2 Arch 1 4 0.0333
## 3 Blue 0 968 0.925
## 4 Blue 1 44 0.0420
## 5 Blue 2 23 0.0220
## 6 Blue 3 5 0.00478
## 7 Blue 4 4 0.00382
## 8 Blue 5 2 0.00191
## 9 Blue 17 1 0.000955
## 10 Green 0 538 0.921
## 11 Green 1 33 0.0565
## 12 Green 2 10 0.0171
## 13 Green 3 2 0.00342
## 14 Green 5 1 0.00171
## 15 Red 0 348 0.902
## 16 Red 1 19 0.0492
## 17 Red 2 11 0.0285
## 18 Red 3 7 0.0181
## 19 Red 4 1 0.00259
# Empirical CDF of INTROD per team (ARCH, BLUE, RED, BROWN), one panel per
# repository; the y axis is restricted to 0.85-1.0.
(p <- data |>
  filter(
    committerteam %in% c(ARCH, BLUE, RED, BROWN),
    repo %in% c(INTTEST, JUPITER, URANUS)
  ) |>
  group_by(repo, committerteam, INTROD) |>
  ggplot(aes(x = INTROD, color = committerteam)) +
  stat_ecdf() +
  facet_wrap(~ repo) +
  labs(
    x = "Maximum number of introduced duplicates",
    title = "Observed cumulative distribution of introduced duplicates"
  ) +
  scale_color_manual(values = COLOR_BY_TEAM) +
  theme_bw() +
  scale_y_continuous(limits = c(0.85, 1.0))
)
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_step()`).
# Empirical CDF of COMPLEX per team, one panel per repository, on a
# log1p-transformed x axis.
(p <- data |>
  filter(
    committerteam %in% c(ARCH, BLUE, RED, GREEN),
    repo %in% c(INTTEST, JUPITER, URANUS)
  ) |>
  group_by(repo, committerteam) |>
  ggplot(aes(x = COMPLEX, color = committerteam)) +
  stat_ecdf() +
  facet_wrap(~ repo) +
  scale_x_continuous(trans = "log1p", breaks = c(0, 1, 3, 10, 50, 200, 1000)) +
  scale_color_manual(values = COLOR_BY_TEAM) +
  theme_bw() +
  labs(
    x = "Existing complexity in changed file (log scale)",
    title = "Observed cumulative frequency of existing complexity"
  )
)
In the Jupiter repo, Team Blue are more likely to make changes in less complex files, relative to the Red team. Same for Arch team. In the Neptune repo, the Arch team likewise are less likely to change complex files, but the Blue and Red teams are more similar.
# Overall maxima of the four change metrics.
summarize(data, max(ADD), max(DEL), max(COMPLEX), max(DUP))
## max(ADD) max(DEL) max(COMPLEX) max(DUP)
## 1 3772 3413 1244 664
# Per team in repo JUPITER: number of file changes and metric maxima.
data |>
  filter(
    committerteam %in% c(ARCH, BLUE, RED, GREEN),
    repo %in% c(JUPITER)
  ) |>
  group_by(committerteam) |>
  summarize(n(), max(ADD), max(DEL), max(COMPLEX), max(DUP))
## # A tibble: 4 × 6
## committerteam `n()` `max(ADD)` `max(DEL)` `max(COMPLEX)` `max(DUP)`
## <fct> <int> <int> <int> <int> <int>
## 1 Arch 623 489 741 1244 44
## 2 Blue 2123 734 730 1096 53
## 3 Green 1172 1374 1575 1122 38
## 4 Red 2166 2439 683 1244 53
# Per team in repo JUPITER: number of file changes and q95() of each metric.
data |>
  filter(
    committerteam %in% c(ARCH, BLUE, RED, GREEN),
    repo %in% c(JUPITER)
  ) |>
  group_by(committerteam) |>
  summarize(n(), q95(ADD), q95(DEL), q95(COMPLEX), q95(DUP))
## # A tibble: 4 × 6
## committerteam `n()` `q95(ADD)` `q95(DEL)` `q95(COMPLEX)` `q95(DUP)`
## <fct> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Arch 623 66 181. 103 20
## 2 Blue 2123 77 38.9 101 16
## 3 Green 1172 135. 64 313 23
## 4 Red 2166 133 59 304 21
# Empirical CDFs per team, one panel per repository, each on a
# log1p-transformed x axis: existing duplicates (DUP), added lines (ADD) and
# deleted lines (DEL).
(p <- data |>
  filter(
    committerteam %in% c(ARCH, BLUE, RED, GREEN),
    repo %in% c(INTTEST, JUPITER, URANUS)
  ) |>
  group_by(repo, committerteam) |>
  ggplot(aes(x = DUP, color = committerteam)) +
  stat_ecdf() +
  facet_wrap(~ repo) +
  scale_x_continuous(trans = "log1p", breaks = c(0, 1, 3, 10, 50, 200, 1000)) +
  scale_color_manual(values = COLOR_BY_TEAM) +
  theme_bw() +
  labs(
    x = "Duplicates in changed file (log scale)",
    title = "Observed cumulative frequency of existing duplicates"
  )
)
(p <- data |>
  filter(
    committerteam %in% c(ARCH, BLUE, RED, GREEN),
    repo %in% c(INTTEST, JUPITER, URANUS)
  ) |>
  group_by(repo, committerteam) |>
  ggplot(aes(x = ADD, color = committerteam)) +
  stat_ecdf() +
  facet_wrap(~ repo) +
  # optional, currently disabled: guide = guide_axis(angle = 90)
  scale_x_continuous(trans = "log1p", breaks = c(0, 1, 3, 10, 50, 200, 1000)) +
  scale_color_manual(values = COLOR_BY_TEAM) +
  theme_bw() +
  labs(
    x = "Number of added lines (log scale)",
    title = "Observed cumulative frequency of added lines"
  )
)
(p <- data |>
  filter(
    committerteam %in% c(ARCH, BLUE, RED, GREEN),
    repo %in% c(INTTEST, JUPITER, URANUS)
  ) |>
  group_by(repo, committerteam) |>
  ggplot(aes(x = DEL, color = committerteam)) +
  stat_ecdf() +
  facet_wrap(~ repo) +
  scale_x_continuous(
    trans = "log1p",
    breaks = c(0, 1, 3, 10, 50, 200, 1000, 4000)
  ) +
  scale_color_manual(values = COLOR_BY_TEAM) +
  theme_bw() +
  # optional, currently disabled: scale_y_continuous(limits = c(0.9, 1))
  labs(
    x = "Number of deleted lines (log scale)",
    title = "Observed cumulative frequency of deleted lines"
  )
)
# Median and 75% quantile (type 3) of ADD, DEL, COMPLEX and DUP, plus the
# row count, for every repo/team combination in `df`.
#   df - data frame with columns repo, committerteam, ADD, DEL, COMPLEX, DUP
# Returns one row per (repo, team).
quantile_table <- function(df) {
  per_team <- df |>
    mutate(team = committerteam) |>
    group_by(repo, team)
  summarize(
    per_team,
    n = n(),
    q50ADD = median(ADD),
    q50DEL = median(DEL),
    q50COMP = median(COMPLEX),
    q50DUP = median(DUP),
    q75ADD = quantile(ADD, probs = 0.75, type = 3),
    q75DEL = quantile(DEL, probs = 0.75, type = 3),
    q75COMP = quantile(COMPLEX, probs = 0.75, type = 3),
    q75DUP = quantile(DUP, probs = 0.75, type = 3)
  )
}
quantile_table(data |> filter(committerteam %in% c(ARCH, BLUE, RED, GREEN), repo %in% c(INTTEST, JUPITER, URANUS)))
## # A tibble: 12 × 11
## # Groups: repo [3]
## repo team n q50ADD q50DEL q50COMP q50DUP q75ADD q75DEL q75COMP q75DUP
## <fct> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 IntTest Arch 241 2 3 36 7 16 29 163 55
## 2 IntTest Blue 1030 8 3 58 8 41 14 167 54
## 3 IntTest Green 727 3 2 65 22 13 7 183 65
## 4 IntTest Red 617 10 4 48 13 47 25 211 66
## 5 Jupiter Arch 623 3 12 16 0 12 34 33 4
## 6 Jupiter Blue 2123 11 2 10 0 28 8 32 1
## 7 Jupiter Green 1172 7 3 21 0 27 11 58 3
## 8 Jupiter Red 2166 5 5 18 0 18 15 49 2
## 9 Uranus Arch 120 2 8 22 0 8 45 33 1
## 10 Uranus Blue 1047 12 2 14 0 37 9 36 1
## 11 Uranus Green 584 6 2 16 0 26 12 35 1
## 12 Uranus Red 386 11 2 10 0 54 14 25 0
# Number of file changes with at most 3 deleted lines, per team, in JUPITER.
data |>
  filter(repo == JUPITER, DEL <= 3, committerteam %in% c(ARCH, BLUE, RED)) |>
  count(committerteam)
## # A tibble: 3 × 2
## committerteam n
## <fct> <int>
## 1 Arch 168
## 2 Blue 1289
## 3 Red 907
# Empirical CDF of COMPLEX per team, one panel per repository, on a
# log1p-transformed x axis.
data |>
  filter(
    committerteam %in% c(ARCH, BLUE, RED, GREEN),
    repo %in% c(INTTEST, JUPITER, URANUS)
  ) |>
  group_by(repo, committerteam, COMPLEX, logCOMPLEX) |>
  ggplot(aes(x = COMPLEX, color = committerteam)) +
  stat_ecdf() +
  facet_wrap(~ repo) +
  scale_color_manual(values = COLOR_BY_TEAM) +
  theme_bw() +
  scale_x_continuous(trans = "log1p", breaks = c(0, 3, 10, 50, 200, 1000))
# Team predicate: TRUE for members of ARCH, RED, BLUE or GREEN.
# NOTE(review): despite its name, this selects four teams.
three_teams <- function(x) {
  selected_teams <- c(ARCH, RED, BLUE, GREEN)
  x %in% selected_teams
}
(p <- plot_introd_issues_in_repo(INTTEST, three_teams) )
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).
There seems to be some team-level variation, as well as repo-level variation, in the number of introduced duplicates.
Comparing the three largest contributors to the largest repo reveals that the Blue team is more likely to introduce a small number of duplicates (it has almost double the number of single-added duplicates as the Red team, which has a similar number of contributions). But the Red team has some more occurrences of 4-8 duplicates added to a single file.
(p <- plot_introd_issues_in_repo(JUPITER, function(x) x %in% c(RED, GREEN,BLUE)) )
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Heatmap of the 95% quantile of introduced duplicates per team and repo.
ggplot(repo_committerteam, aes(x = team, y = repo)) +
  geom_tile(aes(fill = q95)) +
  geom_text(aes(label = round(q95, 0)), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed 95% introductions by team and repo"
  )
The 95% quantile plots show that for many repos, and many teams in those repos, in 19 out of 20 file changes, no duplicates were added. The outlier is INTTEST, where many teams can be expected to introduce single-digit duplicates. In the NEPTUNE repository, we also can expect some more duplicate introduction than in the others.
# Heatmap of the 99% quantile of introduced duplicates per team and repo.
ggplot(repo_committerteam, aes(x = team, y = repo)) +
  geom_tile(aes(fill = q99)) +
  geom_text(aes(label = round(q99, 0)), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed 99% introductions by team and repo"
  )
The pattern repeats for the 99% quantile. Integration test can be expected to have more duplicates introduced, and in particular the Brown team stands out. In the Neptune repo, however, it is the Blue and Green teams that are a bit more likely to introduce duplicates.
We also note that the Architect team are unlikely to introduce duplicates.
# Heatmap of the maximum number of introduced duplicates per team and repo.
ggplot(repo_committerteam, aes(x = team, y = repo)) +
  geom_tile(aes(fill = max)) +
  geom_text(aes(label = round(max, 0)), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed max number of introduced duplicates by team and repo"
  )
# Per-repo minimum and maximum of the file-change counts, used to rescale
# each heatmap row to 0-1 so colours compare teams within a repository.
repo_committerteam_summary <- repo_committerteam |>
  mutate(files = replace_na(files, 0)) |>
  group_by(repo) |>
  summarize(mx = max(files), mn = min(files))
(p <- repo_committerteam |>
  inner_join(repo_committerteam_summary) |>
  mutate(p = (files - mn) / (mx - mn)) |>
  ggplot(aes(x = team, y = repo)) +
  geom_tile(aes(fill = p)) +
  geom_text(aes(label = round(files, 0)), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed number of changes to files by team and repo",
    subtitle = "Heatmap colored by row"
  )
)
## Joining with `by = join_by(repo)`
The Blue team are the most frequent in changing files, with Red and Green following. The Pink, UI and Unknown teams are much more distant.
# Per-repo minimum and maximum of the per-team INTROD maxima, used to rescale
# each heatmap row to 0-1 so colours compare teams within a repository.
repo_committerteam_summary <- repo_committerteam |>
  mutate(max = replace_na(max, 0)) |>
  group_by(repo) |>
  summarize(mx = max(max), mn = min(max))
(p <- repo_committerteam |>
  inner_join(repo_committerteam_summary) |>
  mutate(p = (max - mn) / (mx - mn)) |>
  ggplot(aes(x = team, y = repo)) +
  geom_tile(aes(fill = p)) +
  geom_text(aes(label = round(max, 0)), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed max number of introduced duplicates by team and repo",
    subtitle = "Heatmap colored by row"
  )
)
## Joining with `by = join_by(repo)`
The maximum number of introduced duplicates is more spread out between the teams - Blue, Red and Green each have the lead in one or two repositories. But Brown leads in two (including the integration tests, with the overall maximum of 150 introduced clones), and Yellow leads in Mercury, the smallest repository in terms of LOC.
# Per-repo minimum and maximum of the per-team 99% quantiles of INTROD, used
# to rescale each heatmap row to 0-1 so colours compare teams within a
# repository.
# The NA replacement is applied to q99 itself. Previously its result was
# written to the `max` column, so it never reached the q99 values that are
# summarised below.
repo_committerteam_summary <- repo_committerteam |>
  mutate(q99 = replace_na(q99, 0)) |>
  group_by(repo) |>
  summarize(mx = max(q99), mn = min(q99))
(p <- repo_committerteam |>
  inner_join(repo_committerteam_summary) |>
  mutate(p = (q99 - mn) / (mx - mn)) |>
  ggplot(aes(x = team, y = repo)) +
  geom_tile(aes(fill = p)) +
  geom_text(aes(label = round(q99, 0)), color = "white") +
  xlab("team") +
  ylab("repo") +
  ggtitle(
    "Observed Q99 number of introduced duplicates by team and repo",
    "Heatmap colored by row"
  )
)
## Joining with `by = join_by(repo)`
The 99th percentile is more spread out - but Brown still has a commanding lead in the Integration tests, where they introduced 103 duplicates in a single change.
# Heatmap of the number of distinct committers per team and repository.
data |>
  group_by(repo, committerteam) |>
  select(committer) |>
  distinct() |>
  tally() |>
  ggplot(aes(x = committerteam, y = repo)) +
  geom_tile(aes(fill = n)) +
  geom_text(aes(label = n), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed number of committers by team and repo"
  )
## Adding missing grouping variables: `repo`, `committerteam`
# Heatmap of the number of distinct authors per team and repository.
data |>
  group_by(repo, authorteam) |>
  select(author) |>
  distinct() |>
  tally() |>
  ggplot(aes(x = authorteam, y = repo)) +
  geom_tile(aes(fill = n)) +
  geom_text(aes(label = n), color = "white") +
  labs(
    x = "team",
    y = "repo",
    title = "Observed number of authors by team and repo"
  )
## Adding missing grouping variables: `repo`, `authorteam`
The maximum number of introduced duplicates reveals that the Brown team, which also had high 95% value in the IntTest repo, also introduced 150 duplicates, the overall max value, to a single file in the integration test repository. Overall, we see the pattern repeat:
# INTROD against D (coloured by C) and against C (coloured by D), restricted
# to changes that introduced at least one duplicate.
data |>
  filter(INTROD > 0) |>
  ggplot(aes(x = D, y = INTROD, color = C)) +
  geom_point()
data |>
  filter(INTROD > 0) |>
  ggplot(aes(x = C, y = INTROD, color = D)) +
  geom_point()
Those plots show that complexity, rather than the number of existing duplicates, is more linearly related to the number of introduced duplicates. At the same time, complexity and existing duplicates are also related - it is just that even if DUP is zero, there is no strong evidence that INTROD is low or zero.
# D against C, coloured by INTROD, for changes that introduced duplicates.
data |>
  filter(INTROD > 0) |>
  ggplot(aes(x = C, y = D, color = INTROD)) +
  geom_point()
# Distribution of added lines per repository (violin plus narrow boxplot).
# The log1p transform keeps zero-line changes on the axis. Unlike plotting
# 1 + ADD on a log scale, the tick labels show the true ADD values; the shape
# is unchanged because both forms place points at log(1 + ADD).
data |>
  ggplot(aes(x = repo, y = ADD)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  scale_y_continuous(
    trans = "log1p",
    breaks = c(5, 20, 100, 500, 1000, 3000)
  ) +
  ylab("Added lines")
Overall, at the median, every repository has fewer than 10 added lines. Some outliers occur.
# Distribution of added lines per committer team (violin plus narrow
# boxplot). The log1p transform keeps zero-line changes on the axis. Unlike
# plotting 1 + ADD on a log scale, the tick labels show the true ADD values;
# the shape is unchanged because both forms place points at log(1 + ADD).
data |>
  ggplot(aes(x = committerteam, y = ADD)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  scale_y_continuous(
    trans = "log1p",
    breaks = c(5, 20, 100, 500, 1000, 3000)
  ) +
  ylab("Added lines")
Between the teams (regardless of repository), there is more variation.
# Distributions per committer team of deleted lines, existing duplicates and
# complexity (violin plus narrow boxplot). The log1p transform keeps zero
# values on the axis. Unlike plotting 1 + value on a log scale, the tick
# labels show the true values; the shapes are unchanged because both forms
# place points at log(1 + value).
data |>
  ggplot(aes(x = committerteam, y = DEL)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  scale_y_continuous(
    trans = "log1p",
    breaks = c(5, 20, 100, 500, 1000, 3000)
  ) +
  ylab("Deleted lines")
data |>
  ggplot(aes(x = committerteam, y = DUP)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  scale_y_continuous(trans = "log1p", breaks = c(5, 20, 100, 500)) +
  ylab("Existing duplicates")
data |>
  ggplot(aes(x = committerteam, y = COMPLEX)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  scale_y_continuous(trans = "log1p", breaks = c(5, 20, 100, 500, 1000)) +
  ylab("McCabe complexity")
Data definition:
The above definition means that, at minimum, one of either addcomplex or delcomplex is zero, for any given file change. The only way for the churn value to be zero is if an empty file (containing no lines) was removed. This happens four times in the data set.
# ocam_rank_repo() plots of ocam_metrics(ocam_data): one per repository, then
# JUPITER and URANUS combined. Both helpers are defined outside this section.
ocam_data <- data
(p <- ocam_data |>
  ocam_metrics() |>
  dplyr::filter(repo == INTTEST) |>
  ocam_rank_repo() +
  theme(legend.position = "none"))
(p <- ocam_data |>
  ocam_metrics() |>
  filter(repo == INTTEST) |>
  ocam_rank_repo() +
  theme_bw())
(p <- ocam_data |>
  ocam_metrics() |>
  filter(repo == JUPITER) |>
  ocam_rank_repo() +
  theme_bw())
(p <- ocam_data |>
  ocam_metrics() |>
  filter(repo == SATURN) |>
  ocam_rank_repo() +
  theme_bw())
(p <- ocam_data |>
  ocam_metrics() |>
  filter(repo == URANUS) |>
  ocam_rank_repo() +
  theme_bw())
ocam_data |>
  ocam_metrics() |>
  filter(repo == NEPTUNE) |>
  ocam_rank_repo() +
  theme_bw()
ocam_data |>
  ocam_metrics() |>
  filter(repo == VENUS) |>
  ocam_rank_repo() +
  theme_bw()
(p <- ocam_data |>
  ocam_metrics() |>
  filter(repo == MARS) |>
  ocam_rank_repo() +
  theme_bw())
ocam_data |>
  ocam_metrics() |>
  filter(repo == MERCURY) |>
  ocam_rank_repo() +
  theme_bw()
ocam_data |>
  ocam_metrics() |>
  filter(repo %in% c(JUPITER, URANUS)) |>
  ocam_rank_repo() +
  theme_bw() +
  theme(legend.position = "none") +
  ylab(NULL)
Combining repos in the same plot allows for contrasting pictures, like this for Jupiter and Uranus.
We can test whether our data supports the conditional independencies implied by our DAG.
REM _||_ COMP | TEAM
and
REM _||_ DUP | TEAM
These would seem to imply that knowing TEAM makes REM and COMPLEX independent (and similarly REM and DUP). Do our data support that claim?
# Modelling data set: INTROD as y, the predictors A, C, D and R, and the
# grouping columns team (from committerteam) and repo.
d <- data |>
  select(y = INTROD, A, C, D, R, team = committerteam, repo)
# First conditional-independence check: regress C on R and team.
formula <- bf(C ~ R + team)
get_prior(formula = formula, data = d, family = gaussian)
## prior class coef group resp dpar nlpar lb ub
## (flat) b
## (flat) b R
## (flat) b teamBlue
## (flat) b teamBrown
## (flat) b teamGreen
## (flat) b teamOrange
## (flat) b teamPink
## (flat) b teamRed
## (flat) b teamUI
## (flat) b teamUnknown
## (flat) b teamViolet
## (flat) b teamYellow
## student_t(3, 0.1, 2.5) Intercept
## student_t(3, 0, 2.5) sigma 0
## source
## default
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## default
## default
# Priors: normal(0, 0.5) on the intercept and all coefficients,
# exponential(1) on sigma. validate_prior() lists how they are assigned.
priors <- c(
  prior(normal(0, 0.5), class = Intercept),
  prior(normal(0, 0.5), class = b),
  prior(exponential(1), class = sigma)
)
(v <- validate_prior(
  prior = priors,
  formula = formula,
  data = d,
  family = gaussian
))
## prior class coef group resp dpar nlpar lb ub source
## normal(0, 0.5) b user
## normal(0, 0.5) b R (vectorized)
## normal(0, 0.5) b teamBlue (vectorized)
## normal(0, 0.5) b teamBrown (vectorized)
## normal(0, 0.5) b teamGreen (vectorized)
## normal(0, 0.5) b teamOrange (vectorized)
## normal(0, 0.5) b teamPink (vectorized)
## normal(0, 0.5) b teamRed (vectorized)
## normal(0, 0.5) b teamUI (vectorized)
## normal(0, 0.5) b teamUnknown (vectorized)
## normal(0, 0.5) b teamViolet (vectorized)
## normal(0, 0.5) b teamYellow (vectorized)
## normal(0, 0.5) Intercept user
## exponential(1) sigma 0 user
# Fit the Gaussian model C ~ R + team with the priors above. The fit is
# cached under "eda-M_cond_ind_R_C" with file_refit = "on_change".
M_cond_ind <- brm(
  formula = formula,
  data = d,
  family = gaussian,
  prior = priors,
  backend = "cmdstanr",
  chains = CHAINS,
  cores = CORES,
  threads = threading(THREADS),
  iter = ITERATIONS,
  warmup = 1000,
  adapt_delta = ADAPT_DELTA,
  save_pars = SAVE_PARS,
  file = cachefile("eda-M_cond_ind_R_C"),
  file_refit = "on_change"
)
summary(M_cond_ind)
## Family: gaussian
## Links: mu = identity; sigma = identity
## Formula: C ~ R + team
## Data: d (Number of observations: 31007)
## Draws: 4 chains, each with iter = 4000; warmup = 1000; thin = 1;
## total post-warmup draws = 12000
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept -0.13 0.02 -0.18 -0.08 1.00 2875 4078
## R -0.01 0.01 -0.02 0.00 1.00 13552 9336
## teamBlue 0.04 0.03 -0.01 0.09 1.00 3134 5495
## teamBrown -0.18 0.03 -0.24 -0.12 1.00 3739 6261
## teamGreen 0.19 0.03 0.13 0.24 1.00 3341 5645
## teamOrange 0.23 0.03 0.17 0.29 1.00 3803 5988
## teamPink 0.27 0.05 0.17 0.36 1.00 5714 6958
## teamRed 0.31 0.03 0.25 0.36 1.00 3418 5696
## teamUI 0.64 0.12 0.40 0.88 1.00 9529 7916
## teamUnknown -0.19 0.05 -0.28 -0.10 1.00 5787 7629
## teamViolet 0.51 0.04 0.44 0.58 1.00 4337 7033
## teamYellow 0.08 0.03 0.02 0.14 1.00 3928 6672
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 0.99 0.00 0.98 0.99 1.00 16772 8598
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
We are only really interested in the population-level effect of R on C here (ignoring the team coefficients). The summary clearly shows that 0 lies within the 95% interval of that coefficient (-0.02 to 0.00).
# Second conditional-independence check: regress D on R and team.
formula <- bf(D ~ R + team)
get_prior(formula = formula, data = d, family = gaussian)
## prior class coef group resp dpar nlpar lb ub
## (flat) b
## (flat) b R
## (flat) b teamBlue
## (flat) b teamBrown
## (flat) b teamGreen
## (flat) b teamOrange
## (flat) b teamPink
## (flat) b teamRed
## (flat) b teamUI
## (flat) b teamUnknown
## (flat) b teamViolet
## (flat) b teamYellow
## student_t(3, -0.6, 2.5) Intercept
## student_t(3, 0, 2.5) sigma 0
## source
## default
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## default
## default
# Priors: normal(0, 0.5) on the intercept and all coefficients,
# exponential(1) on sigma. validate_prior() lists how they are assigned.
priors <- c(
  prior(normal(0, 0.5), class = Intercept),
  prior(normal(0, 0.5), class = b),
  prior(exponential(1), class = sigma)
)
(v <- validate_prior(
  prior = priors,
  formula = formula,
  data = d,
  family = gaussian
))
## prior class coef group resp dpar nlpar lb ub source
## normal(0, 0.5) b user
## normal(0, 0.5) b R (vectorized)
## normal(0, 0.5) b teamBlue (vectorized)
## normal(0, 0.5) b teamBrown (vectorized)
## normal(0, 0.5) b teamGreen (vectorized)
## normal(0, 0.5) b teamOrange (vectorized)
## normal(0, 0.5) b teamPink (vectorized)
## normal(0, 0.5) b teamRed (vectorized)
## normal(0, 0.5) b teamUI (vectorized)
## normal(0, 0.5) b teamUnknown (vectorized)
## normal(0, 0.5) b teamViolet (vectorized)
## normal(0, 0.5) b teamYellow (vectorized)
## normal(0, 0.5) Intercept user
## exponential(1) sigma 0 user
# Fit the Gaussian model D ~ R + team with the priors above. The fit is
# cached under "eda-M_cond_ind_R_D" with file_refit = "on_change".
M_cond_ind <- brm(
  formula = formula,
  data = d,
  family = gaussian,
  prior = priors,
  backend = "cmdstanr",
  chains = CHAINS,
  cores = CORES,
  threads = threading(THREADS),
  iter = ITERATIONS,
  warmup = 1000,
  adapt_delta = ADAPT_DELTA,
  save_pars = SAVE_PARS,
  file = cachefile("eda-M_cond_ind_R_D"),
  file_refit = "on_change"
)
summary(M_cond_ind)
## Family: gaussian
## Links: mu = identity; sigma = identity
## Formula: D ~ R + team
## Data: d (Number of observations: 31007)
## Draws: 4 chains, each with iter = 4000; warmup = 1000; thin = 1;
## total post-warmup draws = 12000
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 0.01 0.02 -0.03 0.06 1.00 2702 3996
## R 0.12 0.01 0.11 0.13 1.00 12194 8426
## teamBlue -0.09 0.03 -0.14 -0.04 1.00 2915 4761
## teamBrown -0.19 0.03 -0.25 -0.13 1.00 3585 5710
## teamGreen 0.10 0.03 0.05 0.16 1.00 3201 5638
## teamOrange -0.01 0.03 -0.07 0.05 1.00 3537 5977
## teamPink -0.10 0.05 -0.20 -0.00 1.00 5926 7085
## teamRed 0.05 0.03 -0.00 0.11 1.00 3197 5328
## teamUI 0.49 0.12 0.25 0.73 1.00 10398 8228
## teamUnknown -0.25 0.05 -0.34 -0.16 1.00 5866 6725
## teamViolet 0.21 0.04 0.15 0.28 1.00 4091 6485
## teamYellow 0.01 0.03 -0.05 0.07 1.00 3705 6102
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 0.99 0.00 0.98 0.99 1.00 16685 8748
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
There might be a weak correlation between DUP (D) and REM (R): the estimated coefficient for R is 0.12, with a 95% CI between 0.11 and 0.13.
# Histogram of introduced duplicates (INTROD) in non-test rows for one team.
#
# df:   data frame with the columns committerteam, ISTEST, INTROD and repo.
# team: value of committerteam to keep.
#
# Returns a ggplot object: bins are one INTROD unit wide, filled by repo, and
# the bar height is count / sum(count), labelled as a percentage.
plot_nontest_INTROD_for_team <- function(df, team) {
  df |>
    # .env$team pins the comparison to the function argument, so a column
    # named `team` in df cannot shadow it; FALSE replaces the reassignable F.
    filter(committerteam == .env$team, ISTEST == FALSE) |>
    ggplot(aes(x = INTROD, fill = repo)) +
    geom_histogram(aes(y = after_stat(count / sum(count))), binwidth = 1) +
    scale_y_continuous(labels = scales::percent) +
    ggtitle(paste0("Introduced duplicates in non-test code by team ", team))
}
# Histogram of introduced duplicates (INTROD) in test rows for one team.
#
# df:   data frame with the columns committerteam, ISTEST, INTROD and repo.
# team: value of committerteam to keep.
#
# Returns a ggplot object: bins are one INTROD unit wide, filled by repo, and
# the bar height is count / sum(count), labelled as a percentage.
plot_test_INTROD_for_team <- function(df, team) {
  df |>
    # .env$team pins the comparison to the function argument, so a column
    # named `team` in df cannot shadow it; TRUE replaces the reassignable T.
    filter(committerteam == .env$team, ISTEST == TRUE) |>
    ggplot(aes(x = INTROD, fill = repo)) +
    geom_histogram(aes(y = after_stat(count / sum(count))), binwidth = 1) +
    scale_y_continuous(labels = scales::percent) +
    ggtitle(paste0("Introduced duplicates in test code by team ", team))
}
# Histogram of introduced duplicates (INTROD) for one team, test and non-test
# rows together.
#
# df:   data frame with the columns committerteam, INTROD and repo.
# team: value of committerteam to keep.
#
# Returns a ggplot object: bins are one INTROD unit wide, filled by repo, and
# the bar height is count / sum(count), labelled as a percentage.
plot_INTROD_for_team <- function(df, team) {
  df |>
    # .env$team pins the comparison to the function argument, so a column
    # named `team` in df cannot shadow it.
    filter(committerteam == .env$team) |>
    ggplot(aes(x = INTROD, fill = repo)) +
    geom_histogram(aes(y = after_stat(count / sum(count))), binwidth = 1) +
    scale_y_continuous(labels = scales::percent) +
    ggtitle(paste0("Introduced duplicates by team ", team))
}
# Limit the y axis to [0, 0.03] while keeping percent labels: ylim() would
# replace the helper's percent scale with a plain numeric one. Bars outside
# the limits are still dropped, as before.
(p <- plot_nontest_INTROD_for_team(data, BLUE) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.03)))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Limit the y axis to [0, 0.03] while keeping percent labels: ylim() would
# replace the helper's percent scale with a plain numeric one. Bars outside
# the limits are still dropped, as before.
(p <- plot_test_INTROD_for_team(data, BLUE) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.03)))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Limit the y axis to [0, 0.03] while keeping percent labels: ylim() would
# replace the helper's percent scale with a plain numeric one. Bars outside
# the limits are still dropped, as before.
(p <- plot_INTROD_for_team(data, BLUE) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.03)))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_bar()`).
Some repositories (such as IntTest) have lots of test code. Adding the ISTEST predictor does not seem to add more information than the usual repository information (shapes of the histograms are similar).
# Limit the y axis to [0, 0.025] while keeping percent labels: ylim() would
# replace the helper's percent scale with a plain numeric one. Bars outside
# the limits are still dropped, as before.
(p <- plot_nontest_INTROD_for_team(data, RED) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.025)))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Limit the y axis to [0, 0.025] while keeping percent labels: ylim() would
# replace the helper's percent scale with a plain numeric one. Bars outside
# the limits are still dropped, as before.
(p <- plot_test_INTROD_for_team(data, RED) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.025)))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Limit the y axis to [0, 0.025] while keeping percent labels: ylim() would
# replace the helper's percent scale with a plain numeric one. Bars outside
# the limits are still dropped, as before.
(p <- plot_INTROD_for_team(data, RED) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.025)))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_bar()`).
Similar conclusion for Red.
# Limit the y axis to [0, 0.05] while keeping percent labels: ylim() would
# replace the helper's percent scale with a plain numeric one. Bars outside
# the limits are still dropped, as before.
(p <- plot_nontest_INTROD_for_team(data, GREEN) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.05)))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Limit the y axis to [0, 0.05] while keeping percent labels: ylim() would
# replace the helper's percent scale with a plain numeric one. Bars outside
# the limits are still dropped, as before.
(p <- plot_test_INTROD_for_team(data, GREEN) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.05)))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_bar()`).
There’s a small number of zeros in the Venus repository, visible in the plot as well (the other zeros are excluded by the ylim).
# Limit the y axis to [0, 0.05] while keeping percent labels: ylim() would
# replace the helper's percent scale with a plain numeric one. Bars outside
# the limits are still dropped, as before.
(p <- plot_INTROD_for_team(data, GREEN) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.05)))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_bar()`).
Is it true that Architects rarely introduce new files in the Integration Tests?
# For the INTTEST repository, count rows per committer team and ISNEW value,
# and express each count as a percentage of that team's total (`ratio`).
# The earlier `pct` column is gone: it was computed inside summarize() per
# (team, ISNEW) group, where count / sum(count) is always 1, so it was a
# constant 100.
data |>
  filter(repo == INTTEST) |>
  group_by(committerteam, ISNEW) |>
  # "drop_last" leaves the result grouped by committerteam, which is what
  # makes sum(count) in the mutate() below a per-team total.
  summarize(count = n(), .groups = "drop_last") |>
  mutate(ratio = round(100 * count / sum(count), 1))
## # A tibble: 22 × 5
## # Groups: committerteam [11]
## committerteam ISNEW count pct ratio
## <fct> <lgl> <int> <dbl> <dbl>
## 1 Arch FALSE 227 100 94.2
## 2 Arch TRUE 14 100 5.8
## 3 Blue FALSE 937 100 91
## 4 Blue TRUE 93 100 9
## 5 Brown FALSE 308 100 92.8
## 6 Brown TRUE 24 100 7.2
## 7 Green FALSE 713 100 98.1
## 8 Green TRUE 14 100 1.9
## 9 Orange FALSE 269 100 95.4
## 10 Orange TRUE 13 100 4.6
## # ℹ 12 more rows
Not really… 14 new files out of 241 (5.8%), whereas Green and Orange also have 14 and 13, respectively, corresponding to 1.9% and 4.6%. So, no conclusive evidence that other teams introduce more integration test files than the Architects.
# Histogram of deleted lines (DEL) in the JUPITER repository, one facet per
# committer team, with 30-line bins and counts on a pseudo-log y axis.
# NOTE(review): the group_by() is kept from the original; ggplot does not
# appear to depend on it - confirm before removing.
(p <- data |>
  filter(repo == JUPITER) |>
  group_by(committerteam, DEL) |>
  ggplot(aes(x = DEL, fill = committerteam)) +
  geom_histogram(binwidth = 30) +
  facet_wrap(~committerteam) +
  scale_y_continuous(
    trans = scales::pseudo_log_trans(base = 10),
    breaks = c(1, 10, 100, 1000, 5000)
  ) +
  ylab("log10(count)") +
  scale_fill_manual(values = COLOR_BY_TEAM) +
  theme_bw() +
  ggtitle("Deleted lines per team", "Jupiter repository")
)
The Architects possibly delete more lines than the other teams. However, Red and Yellow also seem to delete a lot of lines.
# Scatter of added (y) against deleted (x) lines for the RED and BLUE teams in
# the JUPITER and URANUS repositories, one facet per repo. Both axes are
# pseudo-log; point size follows logCOMPLEX, and the manual colours carry
# their own alpha channel (40 of 255) so overlapping points stay readable.
(p <- data |>
  mutate(team = committerteam) |>
  filter(repo %in% c(JUPITER, URANUS), team %in% c(RED, BLUE)) |>
  ggplot(
    aes(x = DEL, y = ADD, size = logCOMPLEX, colour = team, shape = team)
  ) +
  geom_point() +
  scale_x_continuous(
    trans = scales::pseudo_log_trans(base = 10),
    breaks = c(1, 10, 100, 1000, 2000)
  ) +
  scale_y_continuous(
    trans = scales::pseudo_log_trans(base = 10),
    breaks = c(1, 10, 100, 1000, 2000)
  ) +
  scale_color_manual(
    values = c(
      Blue = rgb(31, 120, 180, 40, maxColorValue = 255),
      Red = rgb(227, 26, 28, 40, maxColorValue = 255)
    )
  ) +
  theme_bw() +
  facet_wrap(~repo) +
  ggtitle("Added and changed lines per team and repo, sized by complexity of the file") +
  theme(legend.position = "bottom")
)
Comparing how Red and Blue perform additions and deletions reveals that Red is more active in the Jupiter repo, and Blue in Uranus. Pure additions correspond to the \(y\)-axis, and pure deletions to the \(x\)-axis. Line changes correspond to the \(y=x\) line, which is clearly visible in both plots.
# Scatter of added (y) against deleted (x) lines for the RED, BLUE and GREEN
# teams in the JUPITER, URANUS and MARS repositories, one facet per repo.
# Both axes are pseudo-log and point size follows logCOMPLEX.
# Transparency comes from the alpha channel (40 of 255) of the manual colours.
# The former scale_alpha(0.1) is removed: no alpha aesthetic is mapped and the
# 0.1 was taken as the scale name, so it had no effect.
(p <- data |>
  filter(
    repo %in% c(JUPITER, URANUS, MARS),
    committerteam %in% c(RED, BLUE, GREEN)
  ) |>
  ggplot(
    aes(
      x = DEL, y = ADD, size = logCOMPLEX,
      colour = committerteam, shape = committerteam
    )
  ) +
  geom_point() +
  scale_x_continuous(
    trans = scales::pseudo_log_trans(base = 10),
    breaks = c(1, 10, 100, 1000, 2000)
  ) +
  scale_y_continuous(
    trans = scales::pseudo_log_trans(base = 10),
    breaks = c(1, 10, 100, 1000, 2000)
  ) +
  scale_color_manual(
    values = c(
      Blue = rgb(31, 120, 180, 40, maxColorValue = 255),
      Red = rgb(227, 26, 28, 40, maxColorValue = 255),
      Green = rgb(51, 160, 44, 40, maxColorValue = 255)
    )
  ) +
  theme_bw() +
  facet_wrap(~repo) +
  ggtitle("Added and changed lines per team and repo, sized by complexity of the file")
)
Adding the Green team and the Mars repository shows that Green is active in Mars, but also in Uranus.
Hard to draw any certain conclusions from these plots, however.
# Introduced duplicates (y) against added lines (x, pseudo-log) for the BLUE
# and GREEN teams in the MARS repository; point size follows COMPLEX.
data |>
  filter(repo == MARS, committerteam %in% c(BLUE, GREEN)) |>
  ggplot(aes(x = ADD, y = INTROD, colour = committerteam, size = COMPLEX)) +
  geom_point() +
  scale_x_continuous(
    trans = scales::pseudo_log_trans(base = 10),
    breaks = c(1, 10, 100, 1000, 2000)
  ) +
  scale_color_manual(
    values = c(
      Blue = rgb(31, 120, 180, 75, maxColorValue = 255),
      Red = rgb(227, 26, 28, 40, maxColorValue = 255),
      Green = rgb(51, 160, 44, 75, maxColorValue = 255)
    )
  ) +
  theme_bw() +
  ggtitle(
    "Introduced duplicates per added lines, sized by complexity of the file",
    "Mars repository"
  )
# Introduced duplicates (y) against added lines (x, pseudo-log) for the BLUE
# and RED teams in the JUPITER repository; point size follows COMPLEX.
data |>
  filter(repo == JUPITER, committerteam %in% c(BLUE, RED)) |>
  ggplot(aes(x = ADD, y = INTROD, colour = committerteam, size = COMPLEX)) +
  geom_point() +
  scale_x_continuous(
    trans = scales::pseudo_log_trans(base = 10),
    breaks = c(1, 10, 100, 1000, 2000)
  ) +
  scale_color_manual(
    values = c(
      Blue = rgb(31, 120, 180, 75, maxColorValue = 255),
      Red = rgb(227, 26, 28, 40, maxColorValue = 255),
      Green = rgb(51, 160, 44, 75, maxColorValue = 255)
    )
  ) +
  theme_bw() +
  ggtitle(
    "Introduced duplicates per added lines, sized by complexity of the file",
    "Jupiter repository"
  )
# Introduced duplicates (y) against added lines (x, pseudo-log) for the BLUE
# and RED teams in the INTTEST repository; point size follows COMPLEX.
data |>
  filter(repo == INTTEST, committerteam %in% c(BLUE, RED)) |>
  ggplot(aes(x = ADD, y = INTROD, colour = committerteam, size = COMPLEX)) +
  geom_point() +
  scale_x_continuous(
    trans = scales::pseudo_log_trans(base = 10),
    breaks = c(1, 10, 100, 1000, 2000)
  ) +
  scale_color_manual(
    values = c(
      Blue = rgb(31, 120, 180, 75, maxColorValue = 255),
      Red = rgb(227, 26, 28, 40, maxColorValue = 255),
      Green = rgb(51, 160, 44, 75, maxColorValue = 255)
    )
  ) +
  theme_bw() +
  ggtitle(
    "Introduced duplicates per added lines, sized by complexity of the file",
    "Int.test repository"
  )
The tendency to introduce clones increases with the number of added lines, as expected.